# Input CSV locations, resolved through here().
mappings_in_path <- here("data/processed/childes/all_types_norm_mappings.csv")
tokens_in_path <- here("data/processed/childes/all_tokens_post-norm.csv")

# Load both tables. `tokens_raw` is the table the analyses below start from;
# they read its `corpus`, `child` and `word` columns.
mappings <- read_csv(file = mappings_in_path)
tokens_raw <- read_csv(file = tokens_in_path)
Grouping by corpus AND child
# Per-child word frequencies.
#
# by_kid1: tag every token row with a combined "<corpus>_<child>" key.
# paste() is vectorised over rows, so no grouping is needed for this step.
by_kid1 <- tokens_raw %>%
  #mutate(., child_id = group_indices(., corpus, child)) %>%
  mutate(corpus_child = paste(corpus, child, sep = "_"))

# by_kid2: one row per (corpus_child, word); `n` is the number of token rows
# that child has for that word.
by_kid2 <- by_kid1 %>%
  dplyr::select(corpus_child, word) %>%
  add_count(corpus_child, word) %>%
  distinct(corpus_child, word, .keep_all = TRUE)

# by_kid3: add the child's total token count `N`, the relative frequency
# `freq = n / N`, and its log transform `trns_freq = log10(freq + 1)`.
# `N` is computed with an explicit grouped sum rather than add_tally(n),
# which would fall back to an auto-generated column name ("nn") because `n`
# already exists.
by_kid3 <- by_kid2 %>%
  group_by(corpus_child) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(
    freq = n / N,
    trns_freq = log10(freq + 1)
  )
# Word-by-child matrix of transformed frequencies: one row per word, one
# column per corpus_child key; words a child never produced are filled with 0.
td_matrix_bykid <- spread(
  dplyr::select(by_kid3, word, corpus_child, trns_freq),
  key = corpus_child,
  value = trns_freq,
  fill = 0
)

# Child-by-child correlation matrix of those frequency columns
# (cor() with its default settings).
M_bykid <- cor(dplyr::select(td_matrix_bykid, -word))

# Only the first ten children are displayed here.
corrplot(
  round(M_bykid[1:10, 1:10], 2),
  method = "number",
  tl.srt = 45
)

# Two-dimensional MDS of the children, using 1 - correlation as the
# dissimilarity. NOTE(review): isoMDS() is presumably MASS::isoMDS -- confirm
# against the file's library() calls.
nm_mds_bykid <- isoMDS(1 - M_bykid, k = 2)
## initial value 29.894866
## iter 5 value 19.726697
## iter 10 value 16.273466
## iter 15 value 15.962877
## iter 20 value 15.806430
## iter 20 value 15.790962
## iter 20 value 15.779068
## final value 15.779068
## converged
# MDS coordinates per child, with the "<corpus>_<child>" row names split back
# into separate `corpus` and `child` columns (the combined key is kept).
# extra = "merge" splits on the first underscore only, so a child name that
# itself contains "_" is kept whole instead of being truncated with a warning.
# Two long corpus names are then abbreviated for use as plot labels.
coords_bykid <- nm_mds_bykid$points %>%
  as.data.frame() %>%
  rename(x = V1, y = V2) %>%
  rownames_to_column(var = "corpus_child") %>%
  separate(
    corpus_child,
    into = c("corpus", "child"),
    sep = "_",
    remove = FALSE,
    extra = "merge"
  ) %>%
  mutate(
    corpus = ifelse(
      corpus == "MacWhinney",
      "McW",
      ifelse(corpus == "EllisWeismer", "EW", corpus)
    )
  )

# One text label per child, placed at its MDS coordinates and coloured by
# corpus. The colour legend is suppressed because the label already carries
# the corpus name; "none" replaces the deprecated `FALSE`.
ggplot(coords_bykid, aes(x, y, label = corpus, color = corpus)) +
  geom_text() +
  theme_minimal() +
  guides(color = "none")

# Same per-child MDS plot, but the listed children are relabelled "REMOVE" so
# they stand out (label and colour both follow `corpus`). `coords_bykid`
# itself is not modified. The colour legend is suppressed with "none", which
# replaces the deprecated `FALSE`.
coords_bykid %>%
  mutate(
    corpus = ifelse(
      corpus_child %in% c(
        "Cornell_Felicia", "Cornell_Rhonda", "Warren_Jmarkey",
        "Cornell_Sarah", "Warren_Gina", "Warren_Mary",
        "Warren_David", "Warren_Louise"
      ),
      "REMOVE",
      corpus
    )
  ) %>%
  ggplot(aes(x, y, label = corpus, color = corpus)) +
  geom_text() +
  theme_minimal() +
  guides(color = "none")

# ggplot(coords_bykid, aes(x, y, label = corpus, color = corpus)) +
# geom_text(size = 3.5) +
# xlim(-0.75, 1) +
# ylim(-0.75, 1) +
# theme_minimal() +
# guides(color = FALSE) +
# labs(title = "ZOOMED IN")
Grouping by corpus only - remove none
# Per-corpus word frequencies (all children included).
#
# by_corpus1: one row per (corpus, word); `n` is the number of token rows the
# corpus has for that word.
by_corpus1 <- tokens_raw %>%
  dplyr::select(corpus, word) %>%
  add_count(corpus, word) %>%
  distinct(corpus, word, .keep_all = TRUE)

# by_corpus2: add the corpus total `N`, the relative frequency
# `freq = n / N`, and its log transform `trns_freq = log10(freq + 1)`.
# `N` is computed with an explicit grouped sum rather than add_tally(n),
# which would fall back to an auto-generated column name ("nn") because `n`
# already exists.
by_corpus2 <- by_corpus1 %>%
  group_by(corpus) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(
    freq = n / N,
    trns_freq = log10(freq + 1)
  )
# Word-by-corpus matrix of transformed frequencies: one row per word, one
# column per corpus; words absent from a corpus are filled with 0.
td_matrix_bycorpus <- spread(
  dplyr::select(by_corpus2, word, corpus, trns_freq),
  key = corpus,
  value = trns_freq,
  fill = 0
)

# Corpus-by-corpus correlation matrix (cor() with its default settings),
# displayed in full with values rounded to two decimals.
M_bycorpus <- cor(dplyr::select(td_matrix_bycorpus, -word))

corrplot(
  round(M_bycorpus, 2),
  method = "number",
  tl.srt = 45
)

# Two-dimensional MDS of the corpora, using 1 - correlation as the
# dissimilarity. NOTE(review): isoMDS() is presumably MASS::isoMDS -- confirm
# against the file's library() calls.
nm_mds_bycorpus <- isoMDS(1 - M_bycorpus, k = 2)
## initial value 18.171402
## iter 5 value 14.421080
## iter 10 value 13.116299
## iter 15 value 12.256268
## iter 15 value 12.247430
## iter 15 value 12.247430
## final value 12.247430
## converged
# MDS coordinates per corpus; the row names of the points matrix become the
# `corpus` column.
coords_bycorpus <- nm_mds_bycorpus$points %>%
  as.data.frame() %>%
  rename(x = V1, y = V2) %>%
  rownames_to_column(var = "corpus")

# One point per corpus with a repelled text label. The colour legend is
# suppressed because every point is already labelled; "none" replaces the
# deprecated `FALSE`.
ggplot(coords_bycorpus, aes(x, y, label = corpus, color = corpus)) +
  geom_point(alpha = 0.7) +
  geom_text_repel() +
  theme_minimal() +
  guides(color = "none")

Grouping by corpus only - remove outliers
# Children ("<corpus>_<child>" keys) to exclude before aggregating by corpus.
outliers <- c(
  "Cornell_Felicia", "Cornell_Rhonda", "Warren_Jmarkey",
  "Cornell_Sarah", "Warren_Gina", "Warren_Mary",
  "Warren_David", "Warren_Louise"
)

# removed1: drop every token row belonging to an excluded child, then count
# tokens per (corpus, word) and keep one row per pair.
removed1 <- tokens_raw %>%
  filter(!(paste(corpus, child, sep = "_") %in% outliers)) %>%
  dplyr::select(corpus, word) %>%
  add_count(corpus, word) %>%
  distinct(corpus, word, .keep_all = TRUE)

# removed2: add the corpus total `N`, the relative frequency `freq = n / N`,
# and its log transform `trns_freq = log10(freq + 1)`.
# `N` is computed with an explicit grouped sum rather than add_tally(n),
# which would fall back to an auto-generated column name ("nn") because `n`
# already exists.
removed2 <- removed1 %>%
  group_by(corpus) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(
    freq = n / N,
    trns_freq = log10(freq + 1)
  )
# Word-by-corpus matrix of transformed frequencies built from the filtered
# tokens: one row per word, one column per corpus, missing cells filled with 0.
td_matrix_removed <- spread(
  dplyr::select(removed2, word, corpus, trns_freq),
  key = corpus,
  value = trns_freq,
  fill = 0
)

# Corpus-by-corpus correlation matrix (cor() with its default settings),
# displayed in full with values rounded to two decimals.
M_removed <- cor(dplyr::select(td_matrix_removed, -word))

corrplot(
  round(M_removed, 2),
  method = "number",
  tl.srt = 45
)

# Two-dimensional MDS of the corpora after outlier removal, using
# 1 - correlation as the dissimilarity. NOTE(review): isoMDS() is presumably
# MASS::isoMDS -- confirm against the file's library() calls.
nm_mds_removed <- isoMDS(1 - M_removed, k = 2)
## initial value 20.851396
## iter 5 value 17.361216
## iter 10 value 13.896642
## iter 15 value 13.574920
## iter 20 value 12.554707
## final value 12.386920
## converged
# MDS coordinates per corpus for the outlier-filtered data; the row names of
# the points matrix become the `corpus` column.
coords_removed <- nm_mds_removed$points %>%
  as.data.frame() %>%
  rename(x = V1, y = V2) %>%
  rownames_to_column(var = "corpus")

# One point per corpus with a repelled text label. The colour legend is
# suppressed because every point is already labelled; "none" replaces the
# deprecated `FALSE`.
ggplot(coords_removed, aes(x, y, label = corpus, color = corpus)) +
  geom_point(alpha = 0.7) +
  geom_text_repel() +
  theme_minimal() +
  guides(color = "none")

Visualizations
corpus

corpus + child

corpus + child outliers removed
